In [1]:
%%bash
ls | grep '\.csv$'
In [2]:
# built-in libs
import email
# processing libs
import pandas as pd
# display libs
from tqdm import tqdm_notebook
In [3]:
emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
emails_df = next(emails_full_df)
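Note: with `chunksize`, `read_csv` returns a `TextFileReader` iterator rather than a DataFrame, so the `next()` call above loads only the first 10,000 rows. A minimal sketch (illustrative, not run here) of streaming the whole file instead:
In [ ]:
# Sketch: stream the full CSV in 10k-row chunks and count rows.
total_rows = 0
for chunk in pd.read_csv('emails.csv', chunksize=10000):
    total_rows += len(chunk)
print('total rows: %i' % total_rows)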
In [4]:
print(emails_df.shape)
emails_df.head()
Out[4]:
In [5]:
emails_df.info()
In [6]:
%%time
messages_obj_lst = []
messages_str_lst = []
message_metadata = {}
for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    # collect every header into per-column lists, padding missing rows with 'N/A'
    for msg_property in msg.keys():
        if msg_property not in message_metadata:
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
        message_metadata[msg_property][i] = msg[msg_property]
    payload = msg.get_payload()  # decode=True
    messages_obj_lst.append(msg)
    messages_str_lst.append(payload)  # .encode('utf-8').decode('unicode_escape')
print('messages_obj_lst size: %i' % len(messages_obj_lst))
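The per-header lists gathered into `message_metadata` can be materialized as their own DataFrame; a minimal sketch (assuming each list holds one entry per row, as built above):
In [ ]:
# Sketch: one column per email header, row-aligned with emails_df.
headers_df = pd.DataFrame(message_metadata)
print(headers_df.shape)
headers_df.head()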
In [7]:
# attach the parsed Message objects and extracted payloads to the dataframe
emails_df = emails_df.assign(message_obj=pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload=pd.Series(messages_str_lst).values)
# collapse newlines in the payload so each body is a single line of text
emails_df['payload'] = emails_df.payload.str.replace(r'\n', ' ', regex=True)
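Caveat: `get_payload()` returns a list, not a string, when a message is multipart. The Enron dump is overwhelmingly plain text, but a hedged sketch that flattens multipart messages with `Message.walk()` would look like this (`flatten_payload` is an illustrative helper, not used above):
In [ ]:
def flatten_payload(msg):
    """Return payload text, joining the text/plain parts of multipart messages."""
    if msg.is_multipart():
        parts = [p.get_payload() for p in msg.walk()
                 if p.get_content_type() == 'text/plain']
        return '\n'.join(parts)
    return msg.get_payload()

flatten_payload(emails_df.message_obj[0])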
In [8]:
emails_df.head()
Out[8]:
In [9]:
for i in range(50):
    print(emails_df.message_obj[i]['Subject'])
In [10]:
del messages_obj_lst
del messages_str_lst
emails_df.drop('message', axis=1, inplace=True)
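To confirm that dropping the raw `message` column actually freed memory, pandas can report true object sizes:
In [ ]:
# memory_usage='deep' measures the actual size of object columns
emails_df.info(memory_usage='deep')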
In [11]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
In [12]:
train = emails_df[:7000]
test = emails_df[7000:]
In [13]:
trainheadlines = []
for row in range(0, len(train.index)):
    trainheadlines.append(train.message_obj[row]['Subject'])
# drop emails with no Subject header
trainheadlines = list(filter(None, trainheadlines))
trainheadlines[:10]
Out[13]:
In [14]:
# trainvect = CountVectorizer()
# Trainfeature = trainvect.fit_transform(trainheadlines)
In [15]:
# Detailed view of the document-term matrix
# DTM_With_Colm = pd.DataFrame(Trainfeature.toarray(), columns=trainvect.get_feature_names())
In [16]:
# Trainfeature.shape
In [17]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
In [18]:
%%time
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = get_stop_words('en')
# create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
# list for tokenized documents in loop
texts = []
# loop through the subject lines
for headline in trainheadlines:
    # clean and tokenize document string
    raw = headline.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [tok for tok in tokens if tok not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)
# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix (bag-of-words)
corpus = [dictionary.doc2bow(text) for text in texts]
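An optional refinement (not applied above): gensim's `Dictionary.filter_extremes` prunes very rare and very common tokens before the bag-of-words step, which often yields cleaner topics. The thresholds below are illustrative:
In [ ]:
# Illustrative: drop tokens in fewer than 5 headlines or in more than half of them,
# then rebuild the corpus against the pruned dictionary.
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in texts]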
In [19]:
%%time
# generate the LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary,
                                           passes=1, chunksize=10000, update_every=1)
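To sanity-check the fit, a single headline can be scored against the trained model; a minimal sketch:
In [ ]:
# Topic mixture of the first headline: list of (topic_id, probability) pairs.
bow = dictionary.doc2bow(texts[0])
print(trainheadlines[0])
print(ldamodel.get_document_topics(bow))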
In [20]:
%%time
print(ldamodel.print_topics(num_topics=10, num_words=3))
In [21]:
ldamodel.print_topics(5)
Out[21]:
In [22]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
news = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
In [23]:
news
Out[23]:
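The interactive view above only renders inside a live notebook; `pyLDAvis.save_html` writes it out as a standalone page (the filename is illustrative):
In [ ]:
pyLDAvis.save_html(news, 'enron_lda_topics.html')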
In [24]:
# %%bash
# nvidia-smi